import math
import os

import jieba			# 先使用jieba进行分词吧...
import numpy as np
import pandas as pd

# Corpus file: each CSV row is treated as one document for the TF-IDF search.
# NOTE(review): "pokemen.csv" looks like a typo for "pokemon.csv" — confirm the on-disk name.
csvPath = os.path.abspath("pokemen.csv")
df = pd.read_csv(csvPath)

def cosin(serachVec, vec):
	"""Cosine similarity between two sparse tf-idf vectors (dict: term -> weight).

	The raw cosine in [-1, 1] is rescaled to [0, 1] via 0.5 + 0.5 * cos.
	Returns 0 when either vector has zero norm (e.g. an empty dict), so the
	degenerate case cannot divide by zero.
	"""
	# Dot product over the shared keys only (all other products are 0).
	num = sum(w * vec[k] for k, w in serachVec.items() if k in vec)
	# Euclidean norms of both vectors.
	# NOTE: np.math was deprecated in NumPy 1.25 and removed in 2.0 —
	# use the stdlib math module instead.
	serachNorm = math.sqrt(sum(w * w for w in serachVec.values()))
	vecNorm = math.sqrt(sum(w * w for w in vec.values()))
	denom = serachNorm * vecNorm
	return 0.5 + 0.5 * (num / denom) if denom != 0 else 0

def idf(words):
	"""Build a smoothed IDF table from a tokenized corpus.

	words: list of documents, each a list of tokens.
	Returns a dict term -> log(N / (df + 1)), where N is the number of
	documents and df is the number of documents containing the term.

	Single pass over the corpus, O(total tokens); the original rescanned
	every document for every token occurrence (O(D^2 * W)).
	"""
	wordsNum = len(words)
	# Document frequency: in how many documents does each distinct term appear.
	docFreq = {}
	for doc in words:
		# set() so a term repeated inside one document counts that document once.
		for w in set(doc):
			docFreq[w] = docFreq.get(w, 0) + 1
	# Smoothed idf formula: log(N / (df + 1)).
	# np.math was removed in NumPy 2.0 — use the stdlib math module.
	return {w: math.log(wordsNum / (cnt + 1)) for w, cnt in docFreq.items()}

def tf(ws):
	"""Term frequency of each distinct token in one document.

	ws: list of tokens. Returns a dict term -> count / len(ws).
	Returns {} for an empty document (no division occurs).

	Single-pass O(n) counting; the original rescanned the whole list for
	every token (accidental O(n^2)).
	"""
	wsNum = len(ws)
	counts = {}
	for w in ws:
		counts[w] = counts.get(w, 0) + 1
	return {w: cnt / wsNum for w, cnt in counts.items()}

def tfidf(tfFreq, idfFreq):
	"""Combine a tf table and an idf table into a tf-idf vector.

	Terms absent from the idf table get weight 0.0 (they were never seen
	in the corpus).
	"""
	return {term: weight * idfFreq.get(term, 0.0) for term, weight in tfFreq.items()}

def getRowStr(index):
	"""Render row *index* of df as 'column:value' pairs joined by spaces.

	The first column (index 0) is skipped — presumably an id/number column;
	verify against the CSV layout.
	"""
	cols = df.columns.values
	vals = df.loc[index].values
	pairs = ("{0}:{1}".format(str(c), str(v)) for c, v in zip(cols[1:], vals[1:]))
	return " ".join(pairs)

def getJiebaWs(s):
	"""Tokenize string s with jieba and return the tokens as a list."""
	return list(jieba.cut(s))

def getWords():
	"""Tokenize every row of df; returns one token list per document."""
	return [getJiebaWs(getRowStr(row)) for row in range(len(df))]

def main():
	# Tokenize every document (one per CSV row).
	docs = getWords()
	# Corpus idf table — only needs computing once, could be cached on disk.
	idfTable = idf(docs)
	# Tokenize the search query.
	query = getJiebaWs("雪")
	# tf-idf vector for the query.
	queryVec = tfidf(tf(query), idfTable)
	# (similarity, document index) pairs; the index maps 1:1 onto df rows
	# (shortcut for this demo — production code should not rely on position).
	scored = []
	for docIndex in range(len(docs)):
		docVec = tfidf(tf(docs[docIndex]), idfTable)
		scored.append((cosin(queryVec, docVec), docIndex))
	# Rank by similarity, highest first, and show the top 5 hits.
	ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)
	for score, docIndex in ranked[:5]:
		print(score)				# similarity score
		print(getRowStr(docIndex))	# matching row text

if "__main__" == __name__:
	main()
	exit(0)